In [1]:
import numpy as np
import pandas as pd
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import tree
from IPython.display import Image

py.offline.init_notebook_mode()
cf.set_config_file(offline=True)

Import data and rename columns so they are easier to process

In [2]:
df = pd.read_csv('data/elastography.csv')
df.rename(columns={'M/F':'sex', 'Colour ':'Color', 'Av Strain':'AV_Strain'}, inplace=True)

One variable classification was accidentally deleted in the spreadsheet so re-insert it here.

In [3]:
df.ix[40,'Classification'] = 0

Plot histograms of each variable to get an idea of the distributions. All variables are catgorical and the coding convesion can be found further down in the 'readable_var_names_mapper' variable.

In [4]:
cols = list(df.columns)
cols.remove('Study ID')
df[cols].iplot(kind='histogram', subplots=True, shape=(6,2))
Drawing...

In order to use a random forest classifier the categorical predictor variables need to be recoded into a columns of dummy variables. I.e. the wall column with possible values 0 and 1 is converted into two seperate columns 'Wall_0' and 'Wall_1' with the value of the variable indicated by a 1 in the appropriate column.

I am also recoding the classifications so that lipomas are included in the benign category to give more power. The analysis is then concerned solely with prediciting malignant over benign masses.

In [5]:
age_dummies = pd.get_dummies(df['Age']).rename(columns=lambda x: 'Age_' + str(x))
size_dummies = pd.get_dummies(df['Size']).rename(columns=lambda x: 'Size_' + str(x))
depth_dummies = pd.get_dummies(df['Depth']).rename(columns=lambda x: 'Depth_' + str(x))
contrast_dummies = pd.get_dummies(df['Contrast']).rename(columns=lambda x: 'Contrast_' + str(x))
wall_dummies = pd.get_dummies(df['Wall']).rename(columns=lambda x: 'Wall_' + str(x))
doppler_dummies = pd.get_dummies(df['Doppler']).rename(columns=lambda x: 'Doppler_' + str(x))
elastography_dummies = pd.get_dummies(df['Elastography']).rename(columns=lambda x: 'Elastography_' + str(x))
sex_dummies = pd.get_dummies(df['sex']).rename(columns=lambda x: 'Sex_' + str(x))
color_dummies = pd.get_dummies(df['Color']).rename(columns=lambda x: 'Color_' + str(x))
site_dummies = pd.get_dummies(df['Site']).rename(columns=lambda x: 'Site_' + str(x))
strain_dummies = pd.get_dummies(df['AV_Strain']).rename(columns=lambda x: 'Strain_' + str(x))

dummies_list = [age_dummies,size_dummies,depth_dummies,contrast_dummies,wall_dummies,doppler_dummies,elastography_dummies,sex_dummies,color_dummies,site_dummies,strain_dummies]
var_names = []
for var in dummies_list:
    for col in var:
        var_names.append(col) 

old_new_map = {0:0,1:0,2:0,3:1}
df['Binary_Classification'] = df['Classification'].map(old_new_map)

df_rc = pd.DataFrame()
df_rc['y'] = df['Binary_Classification']

df_rc = df_rc.join(age_dummies)
df_rc = df_rc.join(size_dummies)
df_rc = df_rc.join(depth_dummies)
df_rc = df_rc.join(contrast_dummies)
df_rc = df_rc.join(wall_dummies)
df_rc = df_rc.join(doppler_dummies)
df_rc = df_rc.join(elastography_dummies)
df_rc = df_rc.join(sex_dummies)
df_rc = df_rc.join(color_dummies)
df_rc = df_rc.join(site_dummies)
df_rc = df_rc.join(strain_dummies)

Create a dictionary to map back to a readable form when neccesary.

In [6]:
readable_var_names_mapper = {
    'Age_0':'Age < 20',
    'Age_1':'Age 21-30',
    'Age_2':'Age 31-40',
    'Age_3':'Age 41-50',
    'Age_4':'Age 51-60',
    'Age_5':'Age 61-70',
    'Age_6':'Age >71',
    'Size_0':'Size 0-5',
    'Size_1':'Size 6 - 10',
    'Size_2':'Size 11 - 20',
    'Size_3':'Size > 21',
    'Depth_0':'Depth = superficial',
    'Depth_1':'Depth = deep',
    'Contrast_0':'Contrast = hypo',
    'Contrast_1':'Contrast = iso',
    'Contrast_2':'Contrast = hyper',
    'Contrast_3':'Contrast = complex',
    'Wall_0':'Wall = regular',
    'Wall_1':'Wall = irregular',
    'Doppler_0':'Doppler = No',
    'Doppler_1':'Doppler = Yes',
    'Elastography_0':'Elastography = homo',
    'Elastography_1':'Elastography = hetero',
    'Sex_0':'Sex = male',
    'Sex_1':'Sex = female',
    'Color_0':'Color = green',
    'Color_1':'Color = green/yellow',
    'Color_2':'Color = green/red',
    'Color_3':'Color = green/yellow/blue',
    'Color_4':'Color = green/blue',
    'Color_5':'Color = blue',
    'Color_6':'Color = green/black',
    'Color_7':'Color = green/blue/black',
    'Color_8':'Color = blue/black',
    'Color_9':'Color = black',
    'Site_0':'Site = head/neck',
    'Site_1':'Site = upper torso',
    'Site_2':'Site = lower torso',
    'Site_3':'Site = arm',
    'Site_4':'Site = leg',
    'Strain_0':'Strain < 1',
    'Strain_1':'Strain 1-5',
    'Strain_2':'Strain 6-10',
    'Strain_3':'Strain > 11'
}

def prettify_var_names(var_list):
    readable_var_list = []
    for item in var_list:
        readable_var_list.append(readable_var_names_mapper[item])
    return readable_var_list

readable_var_names = prettify_var_names(var_names)

Partition into training (80%) and testing (20%) data sets.

In [7]:
y = df_rc['y']
x_cols = list(df_rc.columns)
x_cols.remove('y')
X = df_rc[x_cols]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

Create random forest classifier and fit model using training dataset.

In [8]:
clf = RandomForestClassifier()
clf = clf.fit(X_train,y_train.ravel())

Calculate the classification accuracy of the model using the testing dataset.

In [9]:
print('Classification accuracy is {}%'.format(100*round(clf.score(X_test,y_test),2)))
Classification accuracy is 84.0%

Produce a bar plot showing the contribution of the different levels of each variable to the classification.

In [10]:
features = clf.feature_importances_
features = features * 100
var_dict = sorted(zip(map(lambda x: round(x,4), features), readable_var_names),reverse=True)
sorted_idx = features.argsort()
pos = np.arange(sorted_idx.shape[0]) + .5
feature_ranks = sorted(zip(features[sorted_idx], np.asanyarray(prettify_var_names(x_cols))[sorted_idx]),reverse=True)

df_features = pd.DataFrame(feature_ranks,columns=['contribution %','feature'])
df_features.set_index(['feature'],inplace=True)
df_features.sort_values('contribution %',ascending=False)
df_features = df_features[df_features['contribution %'] > 2.0]
df_features.iplot(kind='bar',legend=True)
Drawing...

Calculate the sensitivity, specificity and general accuracy of the model

In [11]:
predictions = clf.predict(X_test)
y_vals = y_test.ravel()
confusion_matrix = pd.crosstab(predictions, y_vals, rownames=['y_vals'], colnames=['predictions'])
rf_metrics = {
    'sensitivity': round(float(confusion_matrix[1][1]) / (float(confusion_matrix[0][1] + confusion_matrix[1][1])),2),
    'specificity': round(float(confusion_matrix[0][0]) / (float(confusion_matrix[0][0] + confusion_matrix[1][0])),2),
    'accuracy': round(float(confusion_matrix[0][0] + confusion_matrix[1][1]) / float(confusion_matrix[0][0] + confusion_matrix[0][1] + confusion_matrix[1][0] + confusion_matrix[1][1]),2)
}
In [12]:
print(rf_metrics)
{'sensitivity': 0.77, 'accuracy': 0.84, 'specificity': 0.89}

Produce graphical plots of decision trees

In [13]:
i = 0
list_dot_filenames = []
list_png_filenames = []
for est in clf.estimators_:
    dot_filename = 'tree{}.dot'.format(str(i))
    list_dot_filenames.append(dot_filename)
    png_filename = 'tree{}.png'.format(str(i))
    list_png_filenames.append(png_filename)
    tree.export_graphviz(est,out_file=dot_filename)  
    i += 1

The following is an example of a decision tree used in the classifer.

In [14]:
Image(filename='tree1.png')
Out[14]:

This analysis has shown that a random forest using the variables provided can acheive classification accuracy of whether or not a tumour is malignant of around 80%. This is acheived using ultrasound measurements alone.

The next step would be to attempt logistic regression in order to quantify the affects of each variable indidually and to assess what combination of variables are most useful for predicting whether or not a tumour is malignant.

It would also be good to re-run the analysis to see if the classifier is able to differentiate between normal benign tumours and lipomas.